1 package org.apache.lucene.util.automaton;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.util.ArrayList;
21 import java.util.List;
22
23 import org.apache.lucene.util.LuceneTestCase;
24
25 import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
26
27 public class TestLevenshteinAutomata extends LuceneTestCase {
28
29 public void testLev0() throws Exception {
30 assertLev("", 0);
31 assertCharVectors(0);
32 }
33
34 public void testLev1() throws Exception {
35 assertLev("", 1);
36 assertCharVectors(1);
37 }
38
39 public void testLev2() throws Exception {
40 assertLev("", 2);
41 assertCharVectors(2);
42 }
43
44
45 public void testNoWastedStates() throws Exception {
46 assertFalse(Operations.hasDeadStatesFromInitial(new LevenshteinAutomata("abc", false).toAutomaton(1)));
47 }
48
49
50
51
52
53 private void assertCharVectors(int n) {
54 int k = 2*n + 1;
55
56
57 int limit = (int) Math.pow(2, k + 2);
58 for (int i = 0; i < limit; i++) {
59 String encoded = Integer.toString(i, 2);
60 assertLev(encoded, n);
61 }
62 }
63
64
65
66
67
68 private void assertLev(String s, int maxDistance) {
69 LevenshteinAutomata builder = new LevenshteinAutomata(s, false);
70 LevenshteinAutomata tbuilder = new LevenshteinAutomata(s, true);
71 Automaton automata[] = new Automaton[maxDistance + 1];
72 Automaton tautomata[] = new Automaton[maxDistance + 1];
73 for (int n = 0; n < automata.length; n++) {
74 automata[n] = builder.toAutomaton(n);
75 tautomata[n] = tbuilder.toAutomaton(n);
76 assertNotNull(automata[n]);
77 assertNotNull(tautomata[n]);
78 assertTrue(automata[n].isDeterministic());
79 assertTrue(tautomata[n].isDeterministic());
80 assertTrue(Operations.isFinite(automata[n]));
81 assertTrue(Operations.isFinite(tautomata[n]));
82 assertFalse(Operations.hasDeadStatesFromInitial(automata[n]));
83 assertFalse(Operations.hasDeadStatesFromInitial(tautomata[n]));
84
85 if (n > 0) {
86 assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
87 Operations.removeDeadStates(automata[n])));
88 assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n-1]),
89 Operations.removeDeadStates(tautomata[n])));
90 assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
91 Operations.removeDeadStates(automata[n])));
92 assertTrue(Operations.subsetOf(Operations.removeDeadStates(tautomata[n-1]),
93 Operations.removeDeadStates(tautomata[n])));
94 assertNotSame(automata[n-1], automata[n]);
95 }
96
97 assertTrue(Operations.subsetOf(Operations.removeDeadStates(automata[n]),
98 Operations.removeDeadStates(tautomata[n])));
99
100 switch(n) {
101 case 0:
102
103 assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(automata[0])));
104 assertTrue(Operations.sameLanguage(Automata.makeString(s), Operations.removeDeadStates(tautomata[0])));
105 break;
106 case 1:
107
108 assertTrue(Operations.sameLanguage(naiveLev1(s), Operations.removeDeadStates(automata[1])));
109 assertTrue(Operations.sameLanguage(naiveLev1T(s), Operations.removeDeadStates(tautomata[1])));
110 break;
111 default:
112 assertBruteForce(s, automata[n], n);
113 assertBruteForceT(s, tautomata[n], n);
114 break;
115 }
116 }
117 }
118
119
120
121
122
123 private Automaton naiveLev1(String s) {
124 Automaton a = Automata.makeString(s);
125 a = Operations.union(a, insertionsOf(s));
126 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
127 a = Operations.union(a, deletionsOf(s));
128 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
129 a = Operations.union(a, substitutionsOf(s));
130 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
131
132 return a;
133 }
134
135
136
137
138
139 private Automaton naiveLev1T(String s) {
140 Automaton a = naiveLev1(s);
141 a = Operations.union(a, transpositionsOf(s));
142 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
143 return a;
144 }
145
146
147
148
149
150 private Automaton insertionsOf(String s) {
151 List<Automaton> list = new ArrayList<>();
152
153 for (int i = 0; i <= s.length(); i++) {
154 Automaton a = Automata.makeString(s.substring(0, i));
155 a = Operations.concatenate(a, Automata.makeAnyChar());
156 a = Operations.concatenate(a, Automata.makeString(s.substring(i)));
157 list.add(a);
158 }
159
160 Automaton a = Operations.union(list);
161 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
162 return a;
163 }
164
165
166
167
168
169 private Automaton deletionsOf(String s) {
170 List<Automaton> list = new ArrayList<>();
171
172 for (int i = 0; i < s.length(); i++) {
173 Automaton a = Automata.makeString(s.substring(0, i));
174 a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
175 list.add(a);
176 }
177
178 Automaton a = Operations.union(list);
179 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
180 return a;
181 }
182
183
184
185
186
187 private Automaton substitutionsOf(String s) {
188 List<Automaton> list = new ArrayList<>();
189
190 for (int i = 0; i < s.length(); i++) {
191 Automaton a = Automata.makeString(s.substring(0, i));
192 a = Operations.concatenate(a, Automata.makeAnyChar());
193 a = Operations.concatenate(a, Automata.makeString(s.substring(i + 1)));
194 list.add(a);
195 }
196
197 Automaton a = Operations.union(list);
198 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
199 return a;
200 }
201
202
203
204
205
206 private Automaton transpositionsOf(String s) {
207 if (s.length() < 2) {
208 return Automata.makeEmpty();
209 }
210 List<Automaton> list = new ArrayList<>();
211 for (int i = 0; i < s.length()-1; i++) {
212 StringBuilder sb = new StringBuilder();
213 sb.append(s.substring(0, i));
214 sb.append(s.charAt(i+1));
215 sb.append(s.charAt(i));
216 sb.append(s.substring(i+2, s.length()));
217 String st = sb.toString();
218 if (!st.equals(s)) {
219 list.add(Automata.makeString(st));
220 }
221 }
222 Automaton a = Operations.union(list);
223 a = MinimizationOperations.minimize(a, DEFAULT_MAX_DETERMINIZED_STATES);
224 return a;
225 }
226
227 private void assertBruteForce(String input, Automaton dfa, int distance) {
228 CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
229 int maxLen = input.length() + distance + 1;
230 int maxNum = (int) Math.pow(2, maxLen);
231 for (int i = 0; i < maxNum; i++) {
232 String encoded = Integer.toString(i, 2);
233 boolean accepts = ra.run(encoded);
234 if (accepts) {
235 assertTrue(getDistance(input, encoded) <= distance);
236 } else {
237 assertTrue(getDistance(input, encoded) > distance);
238 }
239 }
240 }
241
242 private void assertBruteForceT(String input, Automaton dfa, int distance) {
243 CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa);
244 int maxLen = input.length() + distance + 1;
245 int maxNum = (int) Math.pow(2, maxLen);
246 for (int i = 0; i < maxNum; i++) {
247 String encoded = Integer.toString(i, 2);
248 boolean accepts = ra.run(encoded);
249 if (accepts) {
250 assertTrue(getTDistance(input, encoded) <= distance);
251 } else {
252 assertTrue(getTDistance(input, encoded) > distance);
253 }
254 }
255 }
256
257
258
259
260 private int getDistance (String target, String other) {
261 char[] sa;
262 int n;
263 int p[];
264 int d[];
265 int _d[];
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284 sa = target.toCharArray();
285 n = sa.length;
286 p = new int[n+1];
287 d = new int[n+1];
288
289 final int m = other.length();
290 if (n == 0 || m == 0) {
291 if (n == m) {
292 return 0;
293 }
294 else {
295 return Math.max(n, m);
296 }
297 }
298
299
300
301 int i;
302 int j;
303
304 char t_j;
305
306 int cost;
307
308 for (i = 0; i<=n; i++) {
309 p[i] = i;
310 }
311
312 for (j = 1; j<=m; j++) {
313 t_j = other.charAt(j-1);
314 d[0] = j;
315
316 for (i=1; i<=n; i++) {
317 cost = sa[i-1]==t_j ? 0 : 1;
318
319 d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
320 }
321
322
323 _d = p;
324 p = d;
325 d = _d;
326 }
327
328
329
330 return Math.abs(p[n]);
331 }
332
333 private int getTDistance(String target, String other) {
334 char[] sa;
335 int n;
336 int d[][];
337
338 sa = target.toCharArray();
339 n = sa.length;
340 final int m = other.length();
341 d = new int[n+1][m+1];
342
343 if (n == 0 || m == 0) {
344 if (n == m) {
345 return 0;
346 }
347 else {
348 return Math.max(n, m);
349 }
350 }
351
352
353 int i;
354 int j;
355
356 char t_j;
357
358 int cost;
359
360 for (i = 0; i<=n; i++) {
361 d[i][0] = i;
362 }
363
364 for (j = 0; j<=m; j++) {
365 d[0][j] = j;
366 }
367
368 for (j = 1; j<=m; j++) {
369 t_j = other.charAt(j-1);
370
371 for (i=1; i<=n; i++) {
372 cost = sa[i-1]==t_j ? 0 : 1;
373
374 d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
375
376 if (i > 1 && j > 1 && target.charAt(i-1) == other.charAt(j-2) && target.charAt(i-2) == other.charAt(j-1)) {
377 d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
378 }
379 }
380 }
381
382
383
384 return Math.abs(d[n][m]);
385 }
386 }